Loading libraries
library(knitr)
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyr)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(Rmisc)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)
library(stringr)
library(GGally)
library(psych)
library(lubridate)
library(igraph)
library(ggraph)
library(reshape2)
Reading data
df <- read.csv("IMDB_data_Fall_2024.csv")
dim(df)
## [1] 1930 42
str(df)
## 'data.frame': 1930 obs. of 42 variables:
## $ movie_title : chr "August: Osage County" "Radio" "Coach Carter" "The Possession" ...
## $ movie_id : int 2 12 15 20 22 23 26 31 38 39 ...
## $ imdb_link : chr "http://www.imdb.com/title/tt1322269/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt0316465/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt0393162/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt0431021/?ref_=fn_tt_tt_1" ...
## $ imdb_score : num 7.3 6.9 7.2 5.9 7.6 6.4 7.1 8.1 7.1 6.5 ...
## $ movie_budget : int 25000000 35000000 30000000 14000000 8000000 20000000 22700000 25000000 45000000 40000000 ...
## $ release_day : int 10 24 14 20 22 17 24 21 21 14 ...
## $ release_month : chr "Jan" "Oct" "Jan" "Aug" ...
## $ release_year : int 2014 2003 2005 2012 1979 2006 1987 2007 1998 2007 ...
## $ duration : int 121 109 136 92 112 105 96 122 110 95 ...
## $ language : chr "English" "English" "English" "English" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
## $ maturity_rating : chr "R" "PG" "PG-13" "PG-13" ...
## $ aspect_ratio : num 2.35 1.85 2.35 2.35 1.85 1.85 1.85 2.35 2.35 1.85 ...
## $ distributor : chr "The Weinstein Company" "Columbia Pictures Corporation" "Paramount Pictures" "Lionsgate" ...
## $ nb_news_articles : int 2141 331 223 620 97 173 408 4135 1723 378 ...
## $ director : chr "John Wells" "Michael Tollin" "Thomas Carter" "Ole Bornedal" ...
## $ actor1 : chr "Benedict Cumberbatch" "Alfre Woodard" "Channing Tatum" "Kyra Sedgwick" ...
## $ actor1_star_meter : int 259 2735 573 2047 102 573 12294 628 547 358742 ...
## $ actor2 : chr "Meryl Streep" "Riley Smith" "Rick Gonzalez" "Madison Davenport" ...
## $ actor2_star_meter : int 559 3915 4793 1769 5062 370 13732 2450 1054 3086 ...
## $ actor3 : chr "Julia Roberts" "Debra Winger" "Robert Ri'chard" "Natasha Calis" ...
## $ actor3_star_meter : int 513 1845 6729 11963 5451 3711 8419 3592 3001 642 ...
## $ colour_film : chr "Color" "Color" "Color" "Color" ...
## $ genres : chr "Drama" "Biography|Drama|Sport" "Drama|Sport" "Horror|Thriller" ...
## $ nb_faces : int 3 1 0 0 0 0 2 0 1 4 ...
## $ plot_keywords : chr "based on play|incestuous relationship|pedophilia|secret|teenage daughter" "coach|football|football coach|high school|radio" "basketball|basketball coach|coach|contract|high school" "basketball coach|box|jewish|rabbi|yard sale" ...
## $ action : int 0 0 0 0 0 0 0 0 1 0 ...
## $ adventure : int 0 0 0 0 0 0 1 0 0 0 ...
## $ scifi : int 0 0 0 0 0 0 1 0 0 0 ...
## $ thriller : int 0 0 0 1 0 0 0 1 0 0 ...
## $ musical : int 0 0 0 0 0 0 0 0 0 1 ...
## $ romance : int 0 0 0 0 0 1 0 0 0 1 ...
## $ western : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sport : int 0 1 1 0 0 0 0 0 0 0 ...
## $ horror : int 0 0 0 1 0 0 0 0 1 0 ...
## $ drama : int 1 1 1 0 1 0 0 1 0 0 ...
## $ war : int 0 0 0 0 0 0 0 0 0 0 ...
## $ animation : int 0 0 0 0 0 0 0 0 0 0 ...
## $ crime : int 0 0 0 0 1 0 0 1 0 0 ...
## $ movie_meter_IMDBpro: int 4000 8556 3940 5452 4722 2446 2294 513 697 6854 ...
## $ cinematographer : chr "Adriano Goldman" "Don Burgess" "Sharone Meir" "Dan Laustsen" ...
## $ production_company : chr "The Weinstein Company" "Revolution Studios" "Coach Carter" "Ghost House Pictures" ...
summary(df)
## movie_title movie_id imdb_link imdb_score
## Length:1930 Min. : 2 Length:1930 Min. :1.900
## Class :character 1st Qu.: 2528 Class :character 1st Qu.:5.900
## Mode :character Median : 5802 Mode :character Median :6.600
## Mean : 7067 Mean :6.512
## 3rd Qu.:10604 3rd Qu.:7.300
## Max. :21838 Max. :9.300
## movie_budget release_day release_month release_year
## Min. : 560000 Min. : 1.00 Length:1930 Min. :1936
## 1st Qu.: 8725000 1st Qu.: 9.00 Class :character 1st Qu.:1997
## Median :18000000 Median :17.00 Mode :character Median :2004
## Mean :20973774 Mean :15.95 Mean :2001
## 3rd Qu.:30000000 3rd Qu.:23.00 3rd Qu.:2010
## Max. :55000000 Max. :30.00 Max. :2018
## duration language country maturity_rating
## Min. : 37.0 Length:1930 Length:1930 Length:1930
## 1st Qu.: 96.0 Class :character Class :character Class :character
## Median :106.0 Mode :character Mode :character Mode :character
## Mean :109.7
## 3rd Qu.:118.0
## Max. :330.0
## aspect_ratio distributor nb_news_articles director
## Min. :1.180 Length:1930 Min. : 0.0 Length:1930
## 1st Qu.:1.850 Class :character 1st Qu.: 78.0 Class :character
## Median :2.350 Mode :character Median : 286.0 Mode :character
## Mean :2.096 Mean : 770.6
## 3rd Qu.:2.350 3rd Qu.: 845.5
## Max. :2.760 Max. :60620.0
## actor1 actor1_star_meter actor2 actor2_star_meter
## Length:1930 Min. : 9 Length:1930 Min. : 3
## Class :character 1st Qu.: 505 Class :character 1st Qu.: 1895
## Mode :character Median : 1888 Mode :character Median : 3986
## Mean : 21190 Mean : 17114
## 3rd Qu.: 4665 3rd Qu.: 7667
## Max. :8342201 Max. :5529461
## actor3 actor3_star_meter colour_film genres
## Length:1930 Min. : 8 Length:1930 Length:1930
## Class :character 1st Qu.: 3075 Class :character Class :character
## Mode :character Median : 5856 Mode :character Mode :character
## Mean : 35469
## 3rd Qu.: 12250
## Max. :6292982
## nb_faces plot_keywords action adventure
## Min. : 0.00 Length:1930 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.00 Class :character 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 1.00 Mode :character Median :0.0000 Median :0.0000
## Mean : 1.44 Mean :0.2005 Mean :0.1264
## 3rd Qu.: 2.00 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :31.00 Max. :1.0000 Max. :1.0000
## scifi thriller musical romance
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.00000 Median :0.0000
## Mean :0.1083 Mean :0.2979 Mean :0.07047 Mean :0.2451
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## western sport horror drama
## Min. :0.00000 Min. :0.00000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.000 Median :1.0000
## Mean :0.01762 Mean :0.04819 Mean :0.113 Mean :0.5492
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.00000 Max. :1.000 Max. :1.0000
## war animation crime movie_meter_IMDBpro
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. : 71
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 2836
## Median :0.00000 Median :0.00000 Median :0.0000 Median : 5406
## Mean :0.03627 Mean :0.01036 Mean :0.2161 Mean : 11612
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.: 10198
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :849550
## cinematographer production_company
## Length:1930 Length:1930
## Class :character Class :character
## Mode :character Mode :character
##
##
##
There are 41 predictors and 1 target variable (imdb_score). All of them match their expected data types, so there’s no need to modify the datatype for any feature.
drop some columns
First we drop some columns that we won’t use in our model at all. After getting rid of the titles, urls, and plot_keywords, there’re 38 predictors and the target variable imdb_score.
df$movie_title <- NULL
df$imdb_link <- NULL
df$plot_keywords <- NULL
dim(df)
## [1] 1930 39
Before jumping into EDA analysis, we’ve outlined some business questions to guide us through the process. By answering these questions, we hope to uncover insights that will inform our feature selection and interactions, potentially improving the performance of our predictive model.
We will explore these questions in our upcoming analysis. By doing so, we aim to gain deeper insights into the relationships between features. This will help us determine which interactions between variables are worth including in our model to improve model performance.
To get a feel for the dataset, first we look at shape of the target.
histogram
hist(df$imdb_score, main = "Distribution of IMDb Scores", xlab = "IMDb Score", col = "lightblue", breaks = 30)
boxplot
boxplot(df$imdb_score, main = "Boxplot of IMDb Scores", ylab = "IMDb Score", col = "lightblue")
The IMDb Scores are slightly left-skewed, but it’s not severe enough to be a major concern. This distribution makes sense because people usually rate good movies higher while reserving lower ratings for particularly bad experiences, which are less common. Since many viewers are more inclined to watch movies they expect to enjoy, this could lead to higher overall ratings.
For numerical variables, we first looking at their correlations and distribution.
Correlations with imdb_score
numerical_cols <- df[, c("imdb_score", "movie_budget", "release_day", "release_year", "duration",
"aspect_ratio", "nb_news_articles", "actor1_star_meter", "actor2_star_meter",
"actor3_star_meter", "nb_faces", "movie_meter_IMDBpro")]
cor_matrix <- cor(numerical_cols, use = "pairwise.complete.obs")
# sort on decreasing correlations with imdb_score
cor_sorted <- cor_matrix[order(-cor_matrix[, "imdb_score"]), order(-cor_matrix["imdb_score", ])]
# plot it
corrplot.mixed(cor_sorted, tl.col = "black", tl.pos = "lt")
Scatter plot matrix
options(scipen=999)
ggpairs(df[c("imdb_score", colnames(numerical_cols))],
upper = list(continuous = wrap("cor", size = 4)),
lower = list(continuous = wrap("points", alpha = 0.6)),
diag = list(continuous = "barDiag"),
progress = FALSE, message = FALSE, warnings = FALSE)
## Warning in warn_if_args_exist(list(...)): Extra arguments: 'message',
## 'warnings' are being ignored. If these are meant to be aesthetics, submit them
## using the 'mapping' variable within ggpairs with ggplot2::aes or
## ggplot2::aes_string.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
There is a moderate positive correlation between movie duration and IMDb score. This suggests that longer movies tend to have slightly higher ratings on average. Overall, the correlations shown above are quite low, no matter it’s between target or predictors themselves.
Note: Since Pearson’s correlation only captures linear relationships, non-linear interactions between certain variables and IMDb scores might exist.
For categorical variables, we first look at the dummy variables to see their distributions and correlations. Since some categorical variables in string format may require further conversion and can be more complex, we’ll address those in the later sections and explore them in more detail then.
Dummy variables - genres
There’re imbalances presenting in some predictors.
par(mfrow = c(5, 3), mar = c(2, 2, 2, 2))
genre_columns <- colnames(df)[grepl("action|adventure|scifi|thriller|musical|romance|western|sport|horror|drama|war|animation|crime", colnames(df))]
for (genre in genre_columns) {
genre_counts <- table(df[[genre]])
bar_heights <- barplot(genre_counts, main = paste("Distribution of", genre), col = "lightblue", ylim = c(0, max(genre_counts) * 1.1))
text(x = bar_heights, y = genre_counts / 2, labels = genre_counts, cex = 0.8, col = "black", pos = 3)
}
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + 0.1)
Some genres show insignificant differences in terms of ratings, indicating that we might consider dropping them from our analysis.
par(mfrow = c(3, 5), mar = c(3, 3, 3, 3))
for (genre in genre_columns) {
boxplot(df$imdb_score ~ df[[genre]],
main = paste("IMDb Score by", genre),
ylab = "IMDb Score",
xlab = genre,
col = "lightblue",
las = 2) # las = 2 makes the axis labels perpendicular to the axis
}
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + 0.1)
*** specifically suited for binary-continuous relationships, it measures the strength and direction of the association between a binary variable and a continuous variable.
Drama has the strongest positive correlation with IMDb score, suggesting drama movies tend to be rated higher than average.
for (genre in genre_columns) {
correlation <- biserial(df$imdb_score, df[[genre]])
print(paste(genre, ":", correlation))
}
##
## [1] "action : -0.227064322901513"
##
## [1] "adventure : -0.107108853641586"
##
## [1] "scifi : -0.156732577464518"
##
## [1] "thriller : -0.105581869620907"
##
## [1] "musical : -0.0429522776395504"
##
## [1] "romance : -0.020355293686273"
##
## [1] "western : 0.198305451004262"
##
## [1] "sport : 0.11759999228579"
##
## [1] "horror : -0.274239811445477"
##
## [1] "drama : 0.42494608034447"
##
## [1] "war : 0.255005038531814"
##
## [1] "animation : 0.0610537711481328"
##
## [1] "crime : 0.0862764275417247"
*** R uses NA for missing values, while Python uses NaN for missing values. If the dataset contains other non-standard missing values (““,”NaN”, “None”), R might not recognize them as NA.
colnames(df)[colSums(is.na(df)) > 0]
## character(0)
Language is the only feature having missing data.
for (col in colnames(df)){
na_checking = df[df[col] == "" | df[col] == "NaN" | df[col] == "None", ]
if (nrow(na_checking) > 0) {
print(col)
}
}
## [1] "language"
There are only two missing values in the language variable, and we’ll address that in the next part.
df[df$language == "" | df$language == "NaN" | df$language == "None", ]
## movie_id imdb_score movie_budget release_day release_month release_year
## 868 5032 8.5 4000000 1 Feb 2012
## 1604 12945 7.4 12500000 12 Feb 1982
## duration language country maturity_rating aspect_ratio
## 868 102 None USA PG-13 2.35
## 1604 100 None Canada R 2.35
## distributor nb_news_articles director
## 868 ICM Partners 112 Ron Fricke
## 1604 Films sans FrontiÌÄå¬res 31 Jean-Jacques Annaud
## actor1 actor1_star_meter actor2
## 868 Collin Alfredo St. Dic 1504888 Balinese Tari Legong Dancers
## 1604 Rae Dawn Chong 1153 Everett McGill
## actor2_star_meter actor3 actor3_star_meter colour_film
## 868 546238 Puti Sri Candra Dewi 582819 Color
## 1604 6876 Gary Schwartz 30057 Color
## genres nb_faces action adventure scifi thriller musical
## 868 Documentary|Music 0 0 0 0 0 1
## 1604 Adventure|Drama|History 0 0 1 0 0 0
## romance western sport horror drama war animation crime movie_meter_IMDBpro
## 868 0 0 0 0 0 0 0 0 9155
## 1604 0 0 0 0 1 0 0 0 6820
## cinematographer production_company
## 868 Ron Fricke Bali Film Center
## 1604 Claude Agostini International Cinema Corporation (ICC)
This section focuses on the essential task of cleaning and addressing any imperfections for later analysis, specifically for categorical predictors. We review all the features, addressing them one by one. This process includes handling missing values, converting variable types, encoding, and performing feature extractions.
Because the test dataset has no variation in this predictor (all movies are in English), it doesn’t provide useful information for predicting IMDb scores. So, we decide to drop it.
(skip the following analysis)
Since the two missing movies are from the North America region, it’s reasonable to impute their language as English.
df$language[df$language == "None"] <- "English"
nrow(df[df$language == "" | df$language == "NaN" | df$language == "None", ]) # 0
## [1] 0
bar plot
ggplot(df, aes(x = language)) +
geom_bar(fill = "lightblue") +
labs(title = "Distribution of Languages", x = "Language", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Since this column has an imbalanced distribution, with many granular languages that have little impact on the model, it would be better to group all non-English languages into a ‘Non-English’ category. This approach can simplify the model while still preserving important information about the language factor by keeping only ‘English’ and ‘Non-English’.
df$language[df$language != "English"] <- "non-English"
box plot
boxplot(imdb_score ~ language, data = df, main = "Boxplot of IMDb Scores by Language",
xlab = "Language Group", ylab = "IMDb Score", col = "lightblue")
table(df$language)
##
## English non-English
## 1894 36
The boxplot suggests a significant difference in IMDb scores between English and non-English movies. A pairwise t-test can formally determine whether this new classification is effective by assessing the impact of language on IMDb scores. If we reject the null hypothesis, it indicates that language could be a significant predictor of IMDb scores.
pairwise t-test
(Conservative ranking: Scheffé > Bonferroni > Tukey > Fisher’s LSD)
We choose a middle conservative t-test approach (Bonferroni), to assess the robustness of the newly reclassified levels.
pairwise_results <- pairwise.t.test(df$imdb_score, df$language, p.adjust.method = "bonferroni")
print(pairwise_results)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: df$imdb_score and df$language
##
## English
## non-English 0.00001
##
## P value adjustment method: bonferroni
This result indicates that there is a statistically significant difference in the mean IMDb scores between English and non-English movies, concluding that the language classification impacts the IMDb score.
Split the string format genres and remove duplicate ones with dummy.
Steps:
Splitting the genres and storing in a new column as a list
Identify unique genres from “Genres” after splitting
Filter out genres that aren’t represented as columns
Create new dummified columns for those extracted, unique genres
Assign 1 / 0 to these new dummy variables corresponding to extracted genres list in each movie
Splitting the genres and storing in a new column as a list
df$genre_list <- strsplit(as.character(df$genres), "\\|")
head(df$genre_list)
## [[1]]
## [1] "Drama"
##
## [[2]]
## [1] "Biography" "Drama" "Sport"
##
## [[3]]
## [1] "Drama" "Sport"
##
## [[4]]
## [1] "Horror" "Thriller"
##
## [[5]]
## [1] "Biography" "Crime" "Drama"
##
## [[6]]
## [1] "Comedy" "Romance"
Identify unique genres from “Genres” after splitting
unique_genres <- tolower(unique(unlist(df$genre_list)))
unique_genres
## [1] "drama" "biography" "sport" "horror" "thriller"
## [6] "crime" "comedy" "romance" "adventure" "sci-fi"
## [11] "action" "music" "fantasy" "history" "mystery"
## [16] "family" "war" "musical" "western" "animation"
## [21] "documentary"
Filter out genres that aren’t represented as columns
existing_genre_columns <- colnames(df)[colnames(df) %in% unique_genres]
existing_genre_columns
## [1] "action" "adventure" "thriller" "musical" "romance" "western"
## [7] "sport" "horror" "drama" "war" "animation" "crime"
Create new dummified columns for those extracted unique genres
&
Assign 1 / 0 to these new dummy variables corresponding to extracted genres list in each movie
for (genre in unique_genres) {
if (!(genre %in% existing_genre_columns)) {
df[[genre]] <- sapply(df$genre_list, function(x) ifelse(str_to_title(genre) %in% x, 1, 0))
}
}
df$genre_list <- NULL
head(df)
## movie_id imdb_score movie_budget release_day release_month release_year
## 1 2 7.3 25000000 10 Jan 2014
## 2 12 6.9 35000000 24 Oct 2003
## 3 15 7.2 30000000 14 Jan 2005
## 4 20 5.9 14000000 20 Aug 2012
## 5 22 7.6 8000000 22 Jun 1979
## 6 23 6.4 20000000 17 Mar 2006
## duration language country maturity_rating aspect_ratio
## 1 121 English USA R 2.35
## 2 109 English USA PG 1.85
## 3 136 English USA PG-13 2.35
## 4 92 English USA PG-13 2.35
## 5 112 English USA PG 1.85
## 6 105 English USA PG-13 1.85
## distributor nb_news_articles director
## 1 The Weinstein Company 2141 John Wells
## 2 Columbia Pictures Corporation 331 Michael Tollin
## 3 Paramount Pictures 223 Thomas Carter
## 4 Lionsgate 620 Ole Bornedal
## 5 Paramount Pictures 97 Don Siegel
## 6 Lakeshore International 173 Andy Fickman
## actor1 actor1_star_meter actor2
## 1 Benedict Cumberbatch 259 Meryl Streep
## 2 Alfre Woodard 2735 Riley Smith
## 3 Channing Tatum 573 Rick Gonzalez
## 4 Kyra Sedgwick 2047 Madison Davenport
## 5 Clint Eastwood 102 Patrick McGoohan
## 6 Channing Tatum 573 Alexandra Breckenridge
## actor2_star_meter actor3 actor3_star_meter colour_film
## 1 559 Julia Roberts 513 Color
## 2 3915 Debra Winger 1845 Color
## 3 4793 Robert Ri'chard 6729 Color
## 4 1769 Natasha Calis 11963 Color
## 5 5062 Fred Ward 5451 Color
## 6 370 Laura Ramsey 3711 Color
## genres nb_faces action adventure scifi thriller musical
## 1 Drama 3 0 0 0 0 0
## 2 Biography|Drama|Sport 1 0 0 0 0 0
## 3 Drama|Sport 0 0 0 0 0 0
## 4 Horror|Thriller 0 0 0 0 1 0
## 5 Biography|Crime|Drama 0 0 0 0 0 0
## 6 Comedy|Romance 0 0 0 0 0 0
## romance western sport horror drama war animation crime movie_meter_IMDBpro
## 1 0 0 0 0 1 0 0 0 4000
## 2 0 0 1 0 1 0 0 0 8556
## 3 0 0 1 0 1 0 0 0 3940
## 4 0 0 0 1 0 0 0 0 5452
## 5 0 0 0 0 1 0 0 1 4722
## 6 1 0 0 0 0 0 0 0 2446
## cinematographer production_company biography comedy sci-fi music fantasy
## 1 Adriano Goldman The Weinstein Company 0 0 0 0 0
## 2 Don Burgess Revolution Studios 1 0 0 0 0
## 3 Sharone Meir Coach Carter 0 0 0 0 0
## 4 Dan Laustsen Ghost House Pictures 0 0 0 0 0
## 5 Bruce Surtees Paramount Pictures 1 0 0 0 0
## 6 Greg Gardiner DreamWorks 0 1 0 0 0
## history mystery family documentary
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
dim(df)
## [1] 1930 48
colnames(df)
## [1] "movie_id" "imdb_score" "movie_budget"
## [4] "release_day" "release_month" "release_year"
## [7] "duration" "language" "country"
## [10] "maturity_rating" "aspect_ratio" "distributor"
## [13] "nb_news_articles" "director" "actor1"
## [16] "actor1_star_meter" "actor2" "actor2_star_meter"
## [19] "actor3" "actor3_star_meter" "colour_film"
## [22] "genres" "nb_faces" "action"
## [25] "adventure" "scifi" "thriller"
## [28] "musical" "romance" "western"
## [31] "sport" "horror" "drama"
## [34] "war" "animation" "crime"
## [37] "movie_meter_IMDBpro" "cinematographer" "production_company"
## [40] "biography" "comedy" "sci-fi"
## [43] "music" "fantasy" "history"
## [46] "mystery" "family" "documentary"
# extracted genres: "biography", "comedy", "music", "fantasy", "history", "mystery", "family", "documentary"
Transforming them into a tidy data frame makes visual analysis easier.
genres_all <- c("action", "adventure", "scifi", "thriller", "musical", "romance",
"western", "sport", "horror", "drama", "war", "animation",
"crime", "biography", "comedy", "sci-fi", "music", "fantasy",
"history", "mystery", "family", "documentary")
df_long <- pivot_longer(df, cols = all_of(genres_all), names_to = "genre", values_to = "genre_indicator")
df_long <- df_long[df_long$genre_indicator == 1, ]
head(df_long)
## # A tibble: 6 × 28
## movie_id imdb_score movie_budget release_day release_month release_year
## <int> <dbl> <int> <int> <chr> <int>
## 1 2 7.3 25000000 10 Jan 2014
## 2 12 6.9 35000000 24 Oct 2003
## 3 12 6.9 35000000 24 Oct 2003
## 4 12 6.9 35000000 24 Oct 2003
## 5 15 7.2 30000000 14 Jan 2005
## 6 15 7.2 30000000 14 Jan 2005
## # ℹ 22 more variables: duration <int>, language <chr>, country <chr>,
## # maturity_rating <chr>, aspect_ratio <dbl>, distributor <chr>,
## # nb_news_articles <int>, director <chr>, actor1 <chr>,
## # actor1_star_meter <int>, actor2 <chr>, actor2_star_meter <int>,
## # actor3 <chr>, actor3_star_meter <int>, colour_film <chr>, genres <chr>,
## # nb_faces <int>, movie_meter_IMDBpro <int>, cinematographer <chr>,
## # production_company <chr>, genre <chr>, genre_indicator <dbl>
Box plot
ggplot(df_long, aes(x = genre, y = imdb_score)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Box Plot of IMDb Scores by Genre",
x = "Genre",
y = "IMDb Score") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
As the “animation”, “documentary” are less than 30, they could be group into “other”.
table(df_long$genre)
##
## action adventure animation biography comedy crime
## 387 244 20 152 766 417
## documentary drama family fantasy history horror
## 11 1060 146 172 76 218
## music musical mystery romance sci-fi scifi
## 100 136 203 473 209 209
## sport thriller war western
## 93 575 70 34
Look at the t-test to further filter out genres types that could be grouped together!
for (genre in genres_all) {
# Welch's t-test
t_test_results <- t.test(df$imdb_score[df[[genre]] == 1], df$imdb_score[df[[genre]] == 0])
# Fetch and print insignificant genres (threshold: p-value = 0.05)
if (t_test_results$p.value >= 0.05) {
print(paste(genre, "- p-value:", t_test_results$p.value))
}
}
## [1] "musical - p-value: 0.337662947281095"
## [1] "romance - p-value: 0.475827407419031"
## [1] "animation - p-value: 0.420847318364638"
## [1] "music - p-value: 0.0939739158062037"
## [1] "mystery - p-value: 0.863901049829263"
Group those undesired genres into “other_genres”
grouped_genres <- c("animation", "documentary", "musical", "romance", "music", "mystery")
df$other_genre <- 0
for (col in grouped_genres) {
df$other_genre[df[[col]] == 1] <- 1
}
drop_genres_cols <- c("genres", "animation", "documentary", "musical", "romance", "music", "mystery")
df[drop_genres_cols] <- NULL
colnames(df)
## [1] "movie_id" "imdb_score" "movie_budget"
## [4] "release_day" "release_month" "release_year"
## [7] "duration" "language" "country"
## [10] "maturity_rating" "aspect_ratio" "distributor"
## [13] "nb_news_articles" "director" "actor1"
## [16] "actor1_star_meter" "actor2" "actor2_star_meter"
## [19] "actor3" "actor3_star_meter" "colour_film"
## [22] "nb_faces" "action" "adventure"
## [25] "scifi" "thriller" "western"
## [28] "sport" "horror" "drama"
## [31] "war" "crime" "movie_meter_IMDBpro"
## [34] "cinematographer" "production_company" "biography"
## [37] "comedy" "sci-fi" "fantasy"
## [40] "history" "family" "other_genre"
Because the test dataset has no variation in this predictor (all movies are in colour), it doesn’t provide useful information for predicting IMDb scores. So, we decide to drop it.
(skip the following analysis)
There is a significant difference between the levels. Although the movies we are going to predict are all in color, making it reasonable to drop this variable, for now, we’ll keep it to check its interaction effect with other predictors.
ggplot(df, aes(x = colour_film, y = imdb_score)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Box Plot of IMDb Scores by colour_film",
x = "colour_film",
y = "IMDb Score") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(df, aes(x = maturity_rating, y = imdb_score)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Box Plot of IMDb Scores by maturity_rating",
x = "maturity_rating",
y = "IMDb Score") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Given the presence of rare levels in this feature, we reclassify them into three categories based on the age definitions mentioned earlier to help with dimension reduction and enhance the robustness of the levels.
table(df$maturity_rating)
##
## Approved G GP M NC-17 Passed PG PG-13
## 21 34 2 2 3 4 255 582
## R TV-14 TV-G X
## 1013 3 3 8
Ordinal Levels -
Based on the usual rating hierarchy, the sorted ordinal levels for maturity ratings could be:
= 17/18: R, NC-17, M, Passed, X
group_G <- c("Approved", "TV-G", "GP")
group_PG <- c("PG-13", "TV-14")
group_R <- c("NC-17", "M", "Passed", "X")
df$maturity_rating[df$maturity_rating %in% group_G] <- "G"
df$maturity_rating[df$maturity_rating %in% group_PG] <- "PG"
df$maturity_rating[df$maturity_rating %in% group_R] <- "R"
table(df$maturity_rating)
##
## G PG R
## 60 840 1030
G and R are quite similar in terms of imdb_score.
ggplot(df, aes(x = maturity_rating, y = imdb_score)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Box Plot of IMDb Scores by maturity_rating",
x = "maturity_rating",
y = "IMDb Score") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
t-test
pairwise_results <- pairwise.t.test(df$imdb_score, df$maturity_rating, p.adjust.method = "bonferroni")
print(pairwise_results)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: df$imdb_score and df$maturity_rating
##
## G PG
## PG 0.00019 -
## R 0.41722 0.0000000000012
##
## P value adjustment method: bonferroni
There is no significant difference in IMDb scores between the R and G ratings. However, combining them into a single group seems inappropriate, and there may be potential interaction effects with other predictors. So, we decide to keep all three levels.
table(df$country)
##
## Aruba Australia Belgium Brazil Canada
## 1 23 1 1 38
## China Colombia Czech Republic Denmark France
## 2 1 1 1 40
## Georgia Germany Greece Hong Kong Hungary
## 1 34 1 4 1
## India Indonesia Ireland Italy Japan
## 1 1 5 8 5
## Kyrgyzstan Mexico Netherlands New Zealand Official site
## 1 1 2 5 1
## Peru Russia South Africa South Korea Spain
## 1 1 5 2 7
## Taiwan UK USA West Germany
## 1 177 1555 1
We have 34 countries in the dataset, but some only have a single sample, making them quite rare. We approach it in two ways:
When modeling, we’ll evaluate which approach performs better.
Approach 1: Re-classify countries by continent
Africa <- c("South Africa", "Kenya", "Nigeria", "Egypt", "Ghana", "Morocco", "Uganda")
Asia <- c("China", "India", "Japan", "South Korea", "Indonesia", "Kyrgyzstan", "Russia", "Taiwan", "Hong Kong")
Europe <- c("Belgium", "Czech Republic", "Denmark", "France", "Georgia", "Germany", "Greece", "Hungary", "Ireland", "Italy", "Netherlands", "Spain", "UK", "West Germany")
North_America <- c("USA", "Canada")
Central_South_America <- c("Mexico", "Aruba", "Brazil", "Colombia", "Peru")
Australia <- c("Australia", "New Zealand")
Other <- c("Official site")
df$continent <- 0
df$continent[df$country %in% Africa] <- "Africa"
df$continent[df$country %in% Asia] <- "Asia"
df$continent[df$country %in% Europe] <- "Europe"
df$continent[df$country %in% North_America] <- "North America"
df$continent[df$country %in% Central_South_America] <- "Central South America"
df$continent[df$country %in% Australia] <- "Australia"
df$continent[df$country %in% Other] <- "Other"
table(df$continent)
##
## Africa Asia Australia
## 5 18 28
## Central South America Europe North America
## 5 280 1593
## Other
## 1
Since Africa, Asia, Australia, Central South America, Other each have fewer than 30 samples, which limits their statistical representation, we further consolidate these continents into “other_continents” category.
other_continent <- c("Africa", "Asia", "Australia", "Central South America", "Other")
df$continent[df$continent %in% other_continent] <- "other_continent"
table(df$continent)
##
## Europe North America other_continent
## 280 1593 57
ggplot(df, aes(x = continent, y = imdb_score)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Box Plot of IMDb Scores by continent",
x = "continent",
y = "IMDb Score") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
t-test
pairwise_results <- pairwise.t.test(df$imdb_score, df$continent, p.adjust.method = "bonferroni")
print(pairwise_results)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: df$imdb_score and df$continent
##
## Europe North America
## North America 0.00000014 -
## other_continent 0.51 0.74
##
## P value adjustment method: bonferroni
Although the t-test shows no significant difference in IMDb scores between ‘other_continent’ and Europe or between ‘other_continent’ and North America, keeping ‘other_continent’ as a separate category still provides additional granularity and may help capture nuanced interactions with other predictors. We’ll see.
Approach 2: Re-classify countries by frequncy
over_30_countries <- c("Germany", "West Germany", "Canada", "France", "UK", "USA")
over_30_germany <- c("West Germany")
df$country[!(df$country %in% over_30_countries)] <- "other_countries"
df$country[df$country %in% over_30_germany] <- "Germany"
table(df$country)
##
## Canada France Germany other_countries UK
## 38 40 35 85 177
## USA
## 1555
Box-plot
ggplot(df, aes(x = country, y = imdb_score)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Box Plot of IMDb Scores by country",
x = "country",
y = "IMDb Score") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
t-test
pairwise_results <- pairwise.t.test(df$imdb_score, df$country, p.adjust.method = "bonferroni")
print(pairwise_results)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: df$imdb_score and df$country
##
## Canada France Germany other_countries UK
## France 0.293 - - - -
## Germany 0.782 1.000 - - -
## other_countries 0.023 1.000 1.000 - -
## UK 0.00002257 0.855 0.416 0.959 -
## USA 0.201 1.000 1.000 0.860 0.00000015
##
## P value adjustment method: bonferroni
Despite the lack of strong significance in most comparisons, keeping the ‘other_countries’ category allows for finer granularity and could help capture nuanced interactions with other predictors. Who knows what might happen when we run some non-linear or non-parametric models!!!
The way we approach personnel-related predictors is as follows:
Steps:
Identify the top 10 entries for each predictor
personnel_cols <- c("director", "actor1", "actor2", "actor3", "cinematographer", "production_company", "distributor")
top_10_list <- list()
for (col in personnel_cols) {
top_10 <- head(sort(table(df[[col]]), decreasing = TRUE), 10)
# Store the top 10 in the list with the column name as the key
top_10_list[[col]] <- top_10
cat("Top 10 for", col, ":\n")
print(top_10)
cat("-------------------------\n")
sum_top_10 <- sum(top_10)
cat("Sum of Top 10:", sum_top_10, "\n")
cat("-------------------------\n")
}
## Top 10 for director :
##
## Woody Allen Steven Spielberg Clint Eastwood
## 18 12 11
## Spike Lee Steven Soderbergh Martin Scorsese
## 11 10 9
## Barry Levinson Bobby Farrelly Francis Ford Coppola
## 8 8 8
## Joel Schumacher
## 8
## -------------------------
## Sum of Top 10: 103
## -------------------------
## Top 10 for actor1 :
##
## Robert De Niro Bill Murray J.K. Simmons Kevin Spacey
## 30 17 17 17
## Jason Statham Harrison Ford Johnny Depp Denzel Washington
## 15 14 14 13
## Scarlett Johansson Keanu Reeves
## 13 12
## -------------------------
## Sum of Top 10: 162
## -------------------------
## Top 10 for actor2 :
##
## Morgan Freeman Charlize Theron Brad Pitt Chazz Palminteri
## 9 7 6 6
## Demi Moore Meryl Streep James Franco Jason Flemyng
## 6 6 5 5
## Jay Hernandez Kate Winslet
## 5 5
## -------------------------
## Sum of Top 10: 60
## -------------------------
## Top 10 for actor3 :
##
## Hope Davis Ben Mendelsohn John Heard Robert Duvall
## 6 5 5 5
## Steve Carell Thomas Lennon Anne Heche Bob Gunton
## 5 5 4 4
## Bruce McGill Clifton Collins Jr.
## 4 4
## -------------------------
## Sum of Top 10: 47
## -------------------------
## Top 10 for cinematographer :
##
## multiple Roger Deakins Mark Irwin John Bailey
## 79 18 17 16
## Andrew Dunn Jack N. Green Matthew F. Leonetti Robert Elswit
## 13 13 13 13
## Dean Cundey Don Burgess
## 12 12
## -------------------------
## Sum of Top 10: 206
## -------------------------
## Top 10 for production_company :
##
## Universal Pictures Paramount Pictures
## 110 99
## Columbia Pictures Corporation Warner Bros.
## 96 76
## New Line Cinema Twentieth Century Fox
## 75 70
## Metro-Goldwyn-Mayer (MGM) Touchstone Pictures
## 38 31
## DreamWorks Miramax
## 29 29
## -------------------------
## Sum of Top 10: 653
## -------------------------
## Top 10 for distributor :
##
## Warner Bros. Universal Pictures
## 169 146
## Paramount Pictures Twentieth Century Fox
## 138 126
## Columbia Pictures Corporation New Line Cinema
## 113 73
## Buena Vista Pictures Miramax
## 60 44
## United Artists Metro-Goldwyn-Mayer (MGM)
## 40 39
## -------------------------
## Sum of Top 10: 948
## -------------------------
Create new dummy columns indicating whether a movie falls within the top 10 for each personnel predictor
for (col in personnel_cols) {
top_10_names <- names(top_10_list[[col]])
new_col_name <- paste0("top_", col)
df[[new_col_name]] <- ifelse(df[[col]] %in% top_10_names, 1, 0)
}
df[personnel_cols] <- NULL
colnames(df)
## [1] "movie_id" "imdb_score" "movie_budget"
## [4] "release_day" "release_month" "release_year"
## [7] "duration" "language" "country"
## [10] "maturity_rating" "aspect_ratio" "nb_news_articles"
## [13] "actor1_star_meter" "actor2_star_meter" "actor3_star_meter"
## [16] "colour_film" "nb_faces" "action"
## [19] "adventure" "scifi" "thriller"
## [22] "western" "sport" "horror"
## [25] "drama" "war" "crime"
## [28] "movie_meter_IMDBpro" "biography" "comedy"
## [31] "sci-fi" "fantasy" "history"
## [34] "family" "other_genre" "continent"
## [37] "top_director" "top_actor1" "top_actor2"
## [40] "top_actor3" "top_cinematographer" "top_production_company"
## [43] "top_distributor"
head(df)
## movie_id imdb_score movie_budget release_day release_month release_year
## 1 2 7.3 25000000 10 Jan 2014
## 2 12 6.9 35000000 24 Oct 2003
## 3 15 7.2 30000000 14 Jan 2005
## 4 20 5.9 14000000 20 Aug 2012
## 5 22 7.6 8000000 22 Jun 1979
## 6 23 6.4 20000000 17 Mar 2006
## duration language country maturity_rating aspect_ratio nb_news_articles
## 1 121 English USA R 2.35 2141
## 2 109 English USA PG 1.85 331
## 3 136 English USA PG 2.35 223
## 4 92 English USA PG 2.35 620
## 5 112 English USA PG 1.85 97
## 6 105 English USA PG 1.85 173
## actor1_star_meter actor2_star_meter actor3_star_meter colour_film nb_faces
## 1 259 559 513 Color 3
## 2 2735 3915 1845 Color 1
## 3 573 4793 6729 Color 0
## 4 2047 1769 11963 Color 0
## 5 102 5062 5451 Color 0
## 6 573 370 3711 Color 0
## action adventure scifi thriller western sport horror drama war crime
## 1 0 0 0 0 0 0 0 1 0 0
## 2 0 0 0 0 0 1 0 1 0 0
## 3 0 0 0 0 0 1 0 1 0 0
## 4 0 0 0 1 0 0 1 0 0 0
## 5 0 0 0 0 0 0 0 1 0 1
## 6 0 0 0 0 0 0 0 0 0 0
## movie_meter_IMDBpro biography comedy sci-fi fantasy history family
## 1 4000 0 0 0 0 0 0
## 2 8556 1 0 0 0 0 0
## 3 3940 0 0 0 0 0 0
## 4 5452 0 0 0 0 0 0
## 5 4722 1 0 0 0 0 0
## 6 2446 0 1 0 0 0 0
## other_genre continent top_director top_actor1 top_actor2 top_actor3
## 1 0 North America 0 0 1 0
## 2 0 North America 0 0 0 0
## 3 0 North America 0 0 0 0
## 4 0 North America 0 0 0 0
## 5 0 North America 0 0 0 0
## 6 1 North America 0 0 0 0
## top_cinematographer top_production_company top_distributor
## 1 0 0 0
## 2 1 0 1
## 3 0 0 1
## 4 0 0 0
## 5 0 1 1
## 6 0 1 0
Even though the upcoming movies we’re predicting are all released in November, and the timing predictors might seem irrelevant, they could still be valuable. If historical data reveals significant trends or patterns associated with specific days of the week, release periods within a month (early, mid, or late month), or whether it’s a weekday or weekend, these predictors could help improve the model’s performance.
bar chart (Distribution)
We can classify the day into three groups representing early, mid, and late parts of a month to see whether there’s a trend.
# convert the chr to factor, so that the plot will present in order
df$release_month <- factor(df$release_month, levels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
df$release_day_group <- cut(as.numeric(df$release_day),
breaks = c(0, 10, 20, 31),
labels = c("Early Month", "Mid Month", "Late Month"),
right = TRUE)
release_cols <- colnames(df)[grepl("release_day|release_day_group|release_month|release_year", colnames(df))]
for (col in release_cols) {
print(col)
print(table(df[[col]]))
barplot(table(df[[col]]), main=paste("Distribution of", col), col="lightblue")
}
## [1] "release_day"
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 78 54 50 48 54 48 52 49 55 73 63 83 61 63 66 58 68 62 83 87
## 21 22 23 24 25 26 27 28 29 30
## 81 84 70 65 110 61 55 55 50 44
## [1] "release_month"
##
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 205 155 157 169 94 147 149 172 187 216 148 131
## [1] "release_year"
##
## 1936 1938 1939 1940 1941 1943 1944 1947 1951 1954 1955 1957 1958 1960 1961 1962
## 1 1 1 1 1 1 1 2 2 2 1 1 1 2 1 2
## 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978
## 3 4 3 1 2 4 4 3 7 3 8 3 2 5 9 7
## 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994
## 10 14 14 21 14 18 21 18 19 19 21 17 18 17 25 33
## 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
## 36 47 45 51 66 70 80 83 75 63 72 93 84 91 81 76
## 2011 2012 2013 2014 2015 2016 2017 2018
## 94 80 74 74 69 35 2 1
## [1] "release_day_group"
##
## Early Month Mid Month Late Month
## 561 694 675
box plot
for (col in release_cols) {
boxplot(df$imdb_score ~ df[[col]], main=paste("IMDb Score by", col), ylab="IMDb Score", xlab=col, col = "lightblue")
}
scatter plot
for (col in release_cols) {
p <- ggplot(df, aes_string(x = col, y = "imdb_score")) +
geom_point(color = "black", alpha = 0.6) +
geom_smooth(method = "loess", color = "red", se = TRUE) +
labs(x = col, y = "IMDb Rating", title = paste("IMDb Rating Trends by", col)) +
theme_minimal() # Cleaner theme
print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
From the analysis above, it seems like time-related predictors are useless. The decreasing trend observed in the year dimension is likely due to sample size rather than an actual decline in IMDb scores. Also, no discernible pattern spotted, regardless of the time granularity analyzed.
Mon ~ Sun / Weekday or Weekend
One more thing we can take with them is to analyze whether the release day falls on a weekday / weekend, or on a specific day of the week, has any effect on the IMDb scores.
df$date <- as.Date(with(df, paste(release_year, release_month, release_day, sep = "-")), "%Y-%b-%d")
# Get the day of the week (1 = Sunday, 7 = Saturday)
df$day_of_week <- wday(df$date, label = TRUE)
df$weekday_or_weekend <- ifelse(df$day_of_week %in% c("Sat", "Sun"), "Weekend", "Weekday")
head(df[c("release_year", "release_month", "release_day", "day_of_week", "weekday_or_weekend")])
## release_year release_month release_day day_of_week weekday_or_weekend
## 1 2014 Jan 10 Fri Weekday
## 2 2003 Oct 24 Fri Weekday
## 3 2005 Jan 14 Fri Weekday
## 4 2012 Aug 20 Mon Weekday
## 5 1979 Jun 22 Fri Weekday
## 6 2006 Mar 17 Fri Weekday
box plots
for (col in c("day_of_week", "weekday_or_weekend", "release_month", "release_day_group")) {
print(table(df[col]))
boxplot(df$imdb_score ~ df[[col]], main=paste("IMDb Score by", col), ylab="IMDb Score", xlab=col, col = "lightblue")
}
## day_of_week
## Sun Mon Tue Wed Thu Fri Sat
## 38 24 55 179 67 1523 44
## weekday_or_weekend
## Weekday Weekend
## 1848 82
## release_month
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 205 155 157 169 94 147 149 172 187 216 148 131
## release_day_group
## Early Month Mid Month Late Month
## 561 694 675
pairwise t-test
Since the movies we’re going to predict are all being released in November, it makes more sense to focus on time-related predictors at a granularity of month or smaller. Given that the day unit is too small, we’ve decided to just focus on two groups: ‘weekend / weekday,’ and ‘early / mid / late month.’
time_cols <- c("day_of_week", "weekday_or_weekend", "release_day_group")
for (col in time_cols) {
pairwise_results <- pairwise.t.test(df$imdb_score, df[[col]], p.adjust.method = "bonferroni")
print(pairwise_results)
}
##
## Pairwise comparisons using t tests with pooled SD
##
## data: df$imdb_score and df[[col]]
##
## Sun Mon Tue Wed Thu Fri
## Mon 1.0000 - - - - -
## Tue 1.0000 1.0000 - - - -
## Wed 1.0000 1.0000 0.2871 - - -
## Thu 1.0000 1.0000 1.0000 1.0000 - -
## Fri 1.0000 1.0000 1.0000 0.0013 1.0000 -
## Sat 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
##
## P value adjustment method: bonferroni
##
## Pairwise comparisons using t tests with pooled SD
##
## data: df$imdb_score and df[[col]]
##
## Weekday
## Weekend 0.25
##
## P value adjustment method: bonferroni
##
## Pairwise comparisons using t tests with pooled SD
##
## data: df$imdb_score and df[[col]]
##
## Early Month Mid Month
## Mid Month 1 -
## Late Month 1 1
##
## P value adjustment method: bonferroni
Although there is no significant difference in IMDb scores between levels within each group, this interpretation relies solely on linear perspectives. In a similar fashion to previous predictors, retaining these variables could still be valuable for capturing potential interactions with other predictors. Also, they can enhance visual analysis when exploring trend patterns.
Drop useless columns
drop_release_cols <- c("date", "release_year", "release_month", "release_day")
df[drop_release_cols] <- NULL
colnames(df)
## [1] "movie_id" "imdb_score" "movie_budget"
## [4] "duration" "language" "country"
## [7] "maturity_rating" "aspect_ratio" "nb_news_articles"
## [10] "actor1_star_meter" "actor2_star_meter" "actor3_star_meter"
## [13] "colour_film" "nb_faces" "action"
## [16] "adventure" "scifi" "thriller"
## [19] "western" "sport" "horror"
## [22] "drama" "war" "crime"
## [25] "movie_meter_IMDBpro" "biography" "comedy"
## [28] "sci-fi" "fantasy" "history"
## [31] "family" "other_genre" "continent"
## [34] "top_director" "top_actor1" "top_actor2"
## [37] "top_actor3" "top_cinematographer" "top_production_company"
## [40] "top_distributor" "release_day_group" "day_of_week"
## [43] "weekday_or_weekend"
Moving on to the numerical variables. Similar to the approach taken with categorical predictors, we conduct outlier detection, skewness, scaling and normalization, feature engineering, or data type conversion.
Histograms
num_cols <- c("movie_budget", "nb_faces", "nb_news_articles", "movie_meter_IMDBpro", "actor1_star_meter", "actor2_star_meter", "actor3_star_meter")
options(scipen = 999)
par(mfrow = c(2, 4), mar = c(4, 4, 2, 1))
for (col in num_cols) {
hist(df[[col]],
main = paste("Distribution of ", col),
xlab = col,
col = "lightblue",
breaks = 30,
ylim = c(0, max(table(cut(df[[col]], breaks = 30)), na.rm = TRUE) * 1.1) # Adjust ylim to add some padding
)
}
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + 0.1)
Box Plots
par(mfrow = c(2, 4), mar = c(4, 5, 2, 1), oma = c(0, 0, 0, 0))
for (col in num_cols) {
boxplot(df[[col]],
main = paste("Boxplot of", col),
col = "lightblue",
las = 2,
ylim = range(df[[col]], na.rm = TRUE) + c(-1, 1) * diff(range(df[[col]], na.rm = TRUE)) * 0.1
)
}
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + 0.1)
Except for movie_budget, all predictors show a significant right skew due to outliers. We thus remove those fall outside of 3 IQR from the dataset.
cleaned_df <- df
par(mfrow = c(2, 4), mar = c(4, 5, 2, 1), oma = c(0, 0, 0, 0))
for (col in num_cols) {
Q1 <- quantile(cleaned_df[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(cleaned_df[[col]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 3 * IQR
upper_bound <- Q3 + 3 * IQR
cleaned_df <- cleaned_df[cleaned_df[[col]] >= lower_bound & cleaned_df[[col]] <= upper_bound, ]
hist(cleaned_df[[col]],
main = paste("Distribution of", col),
xlab = col,
col = "lightblue",
breaks = 30,
ylim = c(0, max(table(cut(cleaned_df[[col]], breaks = 30)), na.rm = TRUE) * 1.1)
)
}
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + 0.1)
print(dim(cleaned_df)) # dropped rows = 479, drop rate = 25%
## [1] 1451 43
log transformation
Since after removing 3 IQR, predictor are still right skewed, we decide to use log transformation to reduce skewness.
for (col in num_cols) {
# use log (x + 1) since there're 0 in some predictors
cleaned_df[[paste0(col, "_log")]] <- log(cleaned_df[[col]] + 1)
}
histogram
log_num_cols <- paste0(num_cols, "_log")
par(mfrow = c(2, 4), mar = c(4, 5, 2, 1), oma = c(0, 0, 0, 0))
for (col in log_num_cols) {
hist(cleaned_df[[col]],
main = paste("Distribution of", col),
xlab = col,
col = "lightblue",
breaks = 30,
ylim = c(0, max(table(cut(cleaned_df[[col]], breaks = 30)), na.rm = TRUE) * 1.1) # Adjust ylim to add padding
)
}
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + 0.1)
While distributions of some predictors are left skewed, but they appear to be relatively symmetric than extremely skewed originally after transformation.
Note:
For the actor star meter variables, since they all show similar patterns, we might consider creating a single feature that combines this information to reduce dimensionality (could be like taking the average log star meter of the top 3 actors)
For those left-skewed distributions after log transformation, we might consider:
Scatter Plots & Correlation matrix
options(scipen=999)
ggpairs(cleaned_df[c("imdb_score", num_cols)],
upper = list(continuous = wrap("cor", size = 4)),
lower = list(continuous = wrap("points", alpha = 0.6)),
diag = list(continuous = "barDiag"),
progress = FALSE, message = FALSE, warnings = FALSE)
## Warning in warn_if_args_exist(list(...)): Extra arguments: 'message',
## 'warnings' are being ignored. If these are meant to be aesthetics, submit them
## using the 'mapping' variable within ggpairs with ggplot2::aes or
## ggplot2::aes_string.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Except for the moderate negative correlation between the number of news articles and IMDbPro movie meter rankings, suggesting that movies with more media coverage tend to have better (lower) IMDbPro rankings, many of the other correlations are relatively weak. We could further explore potential complex interactions between variables through cross-analysis.
Target variable
plot_list <- list()
for (time in time_cols) {
df_summary <- cleaned_df %>%
group_by(.data[[time]]) %>%
summarise(mean_value = mean(.data[["imdb_score"]], na.rm = TRUE))
p <- ggplot(df_summary, aes_string(x = time, y = "mean_value", group = 1)) +
geom_line(color = "lightblue", size = 1) +
geom_point(color = "black", size = 2) +
labs(title = "imdb_score",
x = time,
y = "Average imdb_score") +
theme_minimal() +
theme(
plot.title = element_text(size = 8),
axis.title.y = element_text(size = 7),
axis.title.x = element_text(size = 7),
axis.text.x = element_text(size = 7)
)
plot_list <- c(plot_list, list(p))
}
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
grid.arrange(grobs = plot_list, ncol = 3, nrow = 1)
Movies released on weekends or mid-month tend to have higher rating scores, but their effect seems moderate.
Numerical predictors
plot_list <- list()
for (col in num_cols) {
for (time in time_cols) {
# Group and calculate the mean of each predictor
df_summary <- cleaned_df %>%
group_by(.data[[time]]) %>%
summarise(mean_value = mean(.data[[col]], na.rm = TRUE))
p <- ggplot(df_summary, aes_string(x = time, y = "mean_value", group = 1)) +
geom_line(color = "lightblue", size = 1) +
geom_point(color = "black", size = 2) +
labs(title = col,
x = time,
y = paste("Average", col)) +
theme_minimal() +
theme(
plot.title = element_text(size = 8),
axis.title.y = element_text(size = 6),
axis.title.x = element_text(size = 5),
axis.text.x = element_text(size = 5)
)
plot_list <- c(plot_list, list(p))
}
}
grid.arrange(grobs = plot_list, ncol = 6, nrow = 4)
From the trend plots, it indicates that movies released during different timeframes may have varying effects on the predictors.
What predictors have high correlation with IMDb scores?
# Ensure factors use the default (simple) contrast setting
options(contrasts = c("contr.treatment", "contr.treatment")) # Use simple dummy coding for unordered factors
factor_columns <- c("maturity_rating", "country", "continent", "release_day_group", "day_of_week", "weekday_or_weekend")
# Set factors and ensure the correct reference levels for the specified columns
cleaned_df$maturity_rating <- relevel(as.factor(cleaned_df$maturity_rating), ref = "G")
cleaned_df$country <- relevel(as.factor(cleaned_df$country), ref = "other_countries")
cleaned_df$continent <- relevel(as.factor(cleaned_df$continent), ref = "other_continent")
cleaned_df$release_day_group <- relevel(as.factor(cleaned_df$release_day_group), ref = "Mid Month")
cleaned_df$day_of_week <- factor(cleaned_df$day_of_week, ordered = FALSE)
cleaned_df$day_of_week <- relevel(cleaned_df$day_of_week, ref = "Mon")# Set "Mon" as the reference level
cleaned_df$weekday_or_weekend <- relevel(as.factor(cleaned_df$weekday_or_weekend), ref = "Weekday")
# Now create the dummy variables after releveling
dummy <- dummyVars(~ ., data = cleaned_df[, factor_columns], fullRank = TRUE) # fullRank = TRUE excludes the base level
dummy_data <- predict(dummy, newdata = cleaned_df)
dummy_data <- as.data.frame(dummy_data)
# Combine the numeric columns with the new dummy data
numeric_cols <- cleaned_df[, !names(cleaned_df) %in% c("movie_id", "colour_film", "language", "maturity_rating", "country", "continent", "release_day_group", "day_of_week", "weekday_or_weekend", "other_genre")]
df_dummies <- cbind(numeric_cols, dummy_data)
correlation_matrix <- cor(df_dummies)
melted_correlation_matrix <- melt(correlation_matrix)
ggplot(melted_correlation_matrix, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") + # Color of the grid
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name="Correlation") +
theme_minimal() + # Use a minimal theme
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8, hjust = 1)) +
theme(axis.text.y = element_text(size = 8)) +
coord_fixed() + # Fix the aspect ratio
labs(title = "Heatmap of correlations", x = "Variables", y = "Variables")
cor_with_target <- correlation_matrix[, "imdb_score"]
cor_with_target_sorted <- sort(cor_with_target, decreasing = TRUE)
head(cor_with_target_sorted, 10)
## imdb_score duration drama
## 1.0000000 0.4330157 0.3757671
## nb_news_articles nb_news_articles_log biography
## 0.3086500 0.3037897 0.2020239
## maturity_rating.R top_director history
## 0.1679921 0.1532778 0.1500564
## top_actor1
## 0.1411856
Duration, drama genre, and the number of news articles are the top three predictors with the highest correlation to IMDb scores, indicating they are likely to play a crucial role in the predictive model.
Does the release timing (day of the week, weekday or weekend, part of the month) affect IMDb scores?
Movies released on weekends or mid-month tend to have higher rating scores, but their effect seems moderate.
How does the combination of multiple genres in a movie affect its IMDb score?
network graph
genres_retained <- c("action", "adventure", "scifi", "thriller", "western",
"sport", "horror", "drama", "war", "crime", "biography",
"comedy", "fantasy", "history", "family", "other_genre")
high_rated_movies <- cleaned_df %>% filter(imdb_score >= 7.3) # 3rd Qu = 7.3
genre_matrix <- high_rated_movies %>% select(all_of(genres_retained))
# co-occurrence matrix calculation
co_occurrence_matrix <- as.matrix(t(genre_matrix)) %*% as.matrix(genre_matrix)
co_occurrence_df <- melt(co_occurrence_matrix) # need to convert to a dataframe
colnames(co_occurrence_df) <- c("Genre1", "Genre2", "Count")
# remove 0 and self-pairings
co_occurrence_df <- co_occurrence_df %>% filter(Count > 0 & Genre1 != Genre2)
graph <- graph_from_data_frame(co_occurrence_df, directed = FALSE)
ggraph(graph, layout = "fr") +
geom_edge_link(aes(edge_width = Count), edge_colour = "lightblue") +
geom_node_point(size = 5, color = "darkblue") +
geom_node_text(aes(label = name), vjust = 1, hjust = 1, size = 4) +
theme_void() +
labs(title = "Co-occurrence network of genres in high rated movies")
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
correlation heatmap
genre_corr_matrix <- cor(genre_matrix)
ggplot(melt(genre_corr_matrix), aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1), space = "Lab", name="Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
labs(title = "Correlation between genres",
x = "Genre",
y = "Genre")
Some types of genres often co-occurring in high-rated movies. This suggests that introducing interaction terms for strongly correlated genre pairs could potentially be valuable for the model. (Action-Adventure, Action-Sci-Fi, Adventure-Sci-Fi, History-Biography, War-Biography, Thriller-Crime, Thriller-Action)
(*** other potential approach: add athother feature “genre diversity” by counting the number of genres per movie)
How does genre popularity differ across regions?
country
df_genres <- pivot_longer(cleaned_df, cols = all_of(genres_retained), names_to = "genre", values_to = "genre_indicator")
df_genres <- df_genres[df_genres$genre_indicator == 1, ]
genre_country_summary <- df_genres %>%
group_by(country, genre) %>%
summarise(count = n(), .groups = 'drop')
ggplot(genre_country_summary, aes(x = genre, y = country, fill = count)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "blue") +
theme_minimal() +
labs(title = "Heatmap of Genre Popularity Across Countries", x = "Genre", y = "Country")
continent
genre_continent_summary <- df_genres %>%
group_by(continent, genre) %>%
summarise(count = n(), .groups = 'drop')
ggplot(genre_continent_summary, aes(x = genre, y = continent, fill = count)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "blue") +
theme_minimal() +
labs(title = "Heatmap of genre popularity ccross continent", x = "Genre", y = "Continent")
Due to the impact of sample size, the findings from the heatmaps are likely to be unreliable. To handle class imbalance, we might consider techniques like weighted sampling or stratification to ensure that the influence of different regions is properly balanced in the model.
How does the presence of top 10 personnel impact IMDb scores?
imdb_scores_by_personnel <- df_dummies %>%
pivot_longer(cols = starts_with("top_"), names_to = "Personnel", values_to = "Presence")
ggplot(imdb_scores_by_personnel, aes(x = Personnel, y = imdb_score, fill = factor(Presence))) +
geom_boxplot() +
labs(title = "IMDb Scores by top 10 personnel involvement",
x = "Personnel",
y = "IMDb score",
fill = "Presence") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The involvement of top 10 personnel, particularly directors and lead actors, generally correlates with higher IMDb scores. This indicates that including top personnel as features might contribute to predictive model, supporting our approach to feature engineering for personnel-related variables.
Look into movies associated with the top 10 personnel for each predictor. Do different personnel associate with certain countries? What types of movie genres tend to have the highest average IMDb scores among these top personnel?
country_by_personnel <- df_dummies %>%
pivot_longer(cols = starts_with("top_"), names_to = "Personnel", values_to = "Presence") %>%
filter(Presence == 1) %>%
group_by(Personnel) %>%
summarise(across(starts_with("country"), sum)) # Adjust for your country columns
country_by_personnel_melted <- melt(country_by_personnel)
## Using Personnel as id variables
ggplot(country_by_personnel_melted, aes(x = Personnel, y = value, fill = variable)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = "Top Personnel Distribution Across Countries", x = "Personnel", y = "Count of Movies", fill = "Country")
top_personnels <- c("top_director", "top_actor1", "top_actor2", "top_actor3", "top_cinematographer", "top_production_company", "top_distributor")
plot_list <- list()
# Loop through each personnel
for (personnel in top_personnels) {
# Filter the dataframe for the current personnel
df_personnel <- cleaned_df %>% filter(!!sym(personnel) == 1)
# Reshape the dataframe to long format for genres
df_long <- df_personnel %>%
pivot_longer(cols = all_of(genres_retained), names_to = "genre", values_to = "genre_indicator") %>%
filter(genre_indicator == 1) # Keep only the relevant genres
genre_avg_scores <- df_long %>%
group_by(genre) %>%
summarise(avg_score = mean(imdb_score, na.rm = TRUE), .groups = 'drop') %>%
arrange(desc(avg_score)) # Sort by average score in descending order
# Create the bar plot for average IMDb scores by genre
p <- ggplot(genre_avg_scores, aes(x = reorder(genre, avg_score), y = avg_score)) +
geom_bar(stat = "identity", fill = "lightblue") +
labs(title = paste(personnel, " - Average IMDb Scores by Genre"), x = "Genre", y = "Average IMDb Score") +
theme(axis.text.x = element_text(hjust = 1)) +
coord_flip() # Flip coordinates for better readability
# Store the plot in the list
plot_list[[personnel]] <- p
}
# Display the plots in a grid with 4 columns and as many rows as needed
grid.arrange(grobs = plot_list, ncol = 4, nrow = ceiling(length(plot_list) / 4))
As mentioned above, regarding country distributions may be unreliable due to imbalanced country sample size. However, for genre preferences, some of which tend to rank high in average IMDb scores across different personnel categories (eg., history, western). This suggests that introducing interaction terms for specific pairs could potentially be valuable for the model. (top_director-adventure, top_actor1-history, top_actor2-history, top_actor3-war, top_cinematographer-western, top_production_company-western, top_distributor-history)
Top 10 lists for personnel
all_names <- list()
for (category in names(top_10_list)) {
# Extract names and join them with commas
personnel_names <- paste(names(top_10_list[[category]]), collapse = ", ")
# Print the category and the names
cat(category, ":\n", personnel_names, "\n\n")
}
## director :
## Woody Allen, Steven Spielberg, Clint Eastwood, Spike Lee, Steven Soderbergh, Martin Scorsese, Barry Levinson, Bobby Farrelly, Francis Ford Coppola, Joel Schumacher
##
## actor1 :
## Robert De Niro, Bill Murray, J.K. Simmons, Kevin Spacey, Jason Statham, Harrison Ford, Johnny Depp, Denzel Washington, Scarlett Johansson, Keanu Reeves
##
## actor2 :
## Morgan Freeman, Charlize Theron, Brad Pitt, Chazz Palminteri, Demi Moore, Meryl Streep, James Franco, Jason Flemyng, Jay Hernandez, Kate Winslet
##
## actor3 :
## Hope Davis, Ben Mendelsohn, John Heard, Robert Duvall, Steve Carell, Thomas Lennon, Anne Heche, Bob Gunton, Bruce McGill, Clifton Collins Jr.
##
## cinematographer :
## multiple, Roger Deakins, Mark Irwin, John Bailey, Andrew Dunn, Jack N. Green, Matthew F. Leonetti, Robert Elswit, Dean Cundey, Don Burgess
##
## production_company :
## Universal Pictures, Paramount Pictures, Columbia Pictures Corporation, Warner Bros., New Line Cinema, Twentieth Century Fox, Metro-Goldwyn-Mayer (MGM), Touchstone Pictures, DreamWorks, Miramax
##
## distributor :
## Warner Bros., Universal Pictures, Paramount Pictures, Twentieth Century Fox, Columbia Pictures Corporation, New Line Cinema, Buena Vista Pictures, Miramax, United Artists, Metro-Goldwyn-Mayer (MGM)
colnames(df_dummies)
## [1] "imdb_score" "movie_budget"
## [3] "duration" "aspect_ratio"
## [5] "nb_news_articles" "actor1_star_meter"
## [7] "actor2_star_meter" "actor3_star_meter"
## [9] "nb_faces" "action"
## [11] "adventure" "scifi"
## [13] "thriller" "western"
## [15] "sport" "horror"
## [17] "drama" "war"
## [19] "crime" "movie_meter_IMDBpro"
## [21] "biography" "comedy"
## [23] "sci-fi" "fantasy"
## [25] "history" "family"
## [27] "top_director" "top_actor1"
## [29] "top_actor2" "top_actor3"
## [31] "top_cinematographer" "top_production_company"
## [33] "top_distributor" "movie_budget_log"
## [35] "nb_faces_log" "nb_news_articles_log"
## [37] "movie_meter_IMDBpro_log" "actor1_star_meter_log"
## [39] "actor2_star_meter_log" "actor3_star_meter_log"
## [41] "maturity_rating.PG" "maturity_rating.R"
## [43] "country.Canada" "country.France"
## [45] "country.Germany" "country.UK"
## [47] "country.USA" "continent.Europe"
## [49] "continent.North America" "release_day_group.Early Month"
## [51] "release_day_group.Late Month" "day_of_week.Sun"
## [53] "day_of_week.Tue" "day_of_week.Wed"
## [55] "day_of_week.Thu" "day_of_week.Fri"
## [57] "day_of_week.Sat" "weekday_or_weekend.Weekend"